# Code purpose: clean raw data for analysis
# Input file: LyonMalik_Raw_Data.sav
# Output file: LyonMalik_Policing_Data.csv


# Load Data and Libraries ------------------------------------------------------

# Load Libraries
# Attach one package per call. The second positional argument of `library()`
# is `help`, not another package, so `library("tidyverse", "rio")` attached
# only tidyverse and left rio (which provides `import()`) unattached.
library("tidyverse")
library("rio")

# Load Raw Survey Data
root <- "ADD YOUR PATH HERE" # set path to replication file folder
# Read the raw survey file from the replication folder
raw_data <- import(file.path(root, "LyonMalik_Raw_Data.sav"))


# Clean Data Needed for Analysis ------------------------------------------

# Recode the survey's nonresponse (98) and don't know (99) codes to missing.
# Every other value, including values that are already NA, is returned as is.
set_nonresponse_na <- function(x) {
  ifelse(x %in% c(98, 99), NA, x)
}

# Build the analysis variables from the raw survey columns, keep only those
# variables, and write them to LyonMalik_Policing_Data.csv under `root`.
raw_data %>%
  mutate(
    # Make respondent ID variable
    resp_id = M1,
    # Make respondent ethnicity variable; any d5 code not listed here
    # (including missing) yields NA
    ethnicity = case_when(
      d5 %in% 1:4 ~ "Muhajir",
      d5 == 5 ~ "Punjabi",
      d5 == 6 ~ "Sindhi",
      d5 == 7 ~ "Balochi",
      d5 == 9 ~ "Pashtun",
      d5 %in% c(8, 10, 11) ~ "Others"
    ),
    # Create ethnicity indicators for 3 experimental groups
    # (NA when ethnicity is missing)
    muhajir = ifelse(ethnicity == "Muhajir", 1, 0),
    pashtun = ifelse(ethnicity == "Pashtun", 1, 0),
    sindhi = ifelse(ethnicity == "Sindhi", 1, 0),
    # Create non-Sindhi indicator
    non_sindhi = ifelse(sindhi == 1, 0, 1),
    # Create variable for randomized treatment
    # This variable = the randomized ethnicity of the police officer
    # profile that is shown to the respondent
    pol_ethnicity = case_when(
      Randomization_Q6 == 1 ~ "Sindhi",
      Randomization_Q6 == 2 ~ "Pashtun",
      Randomization_Q6 == 3 ~ "Muhajir"
    ),
    # Create an indicator variable for whether or not randomized police
    # profile is coethnic of respondent
    coeth_police = case_when(
      # Ensure that ethnic groups with zero probability of assignment to
      # treatment (i.e. those who are not Sindhi, Pashtun, or Muhajir)
      # are set to missing on the treatment variable. `%in%` never returns
      # NA, so respondents with missing ethnicity are also set to missing
      # instead of falling through to the 0 default below.
      !ethnicity %in% c("Sindhi", "Pashtun", "Muhajir") ~ NA_real_,
      # Without a recorded officer profile the treatment status is unknown,
      # so it must not be coded as non-coethnic
      is.na(pol_ethnicity) ~ NA_real_,
      # Both values are now non-missing members of the three groups
      ethnicity == pol_ethnicity ~ 1,
      TRUE ~ 0
    ),
    # Set nonresponse (98) and don't know (99) to missing on outcomes
    police_listen = set_nonresponse_na(Q6_a),
    police_fair = set_nonresponse_na(Q6_b),
    police_preferential = set_nonresponse_na(Q6_c),
    # Set nonresponse (98) and don't know (99) to missing on trust in police
    police_trust = set_nonresponse_na(q15_4),
    # Same recode for trust in provincial gov
    prov_trust = set_nonresponse_na(q15_9),
    # Same recode for trust in national gov
    nat_trust = set_nonresponse_na(q15_3),
    # Create factor variable for primary sampling unit
    psu = as.factor(M3),
    # Create Muslim indicator
    muslim = ifelse(d7 == 1, 1, 0),
    # Create age variable; note, nonresponse = 98 & doesn't know = 99
    age = set_nonresponse_na(d1),
    # Create indicator for whether age is 30 or under
    under30 = ifelse(age <= 30, 1, 0),
    # Create measure for number of years family lived at current residence
    # Note, no nonresponse or doesn't know on this measure
    years_lived = d2_y,
    # Create education variable; note, nonresponse = 98 & doesn't know = 99
    education = set_nonresponse_na(d4),
    # Create male and female indicators
    # Note, no nonresponse or doesn't know on this measure
    male = ifelse(d9 == 1, 1, 0),
    female = ifelse(d9 == 2, 1, 0),
    # Create indicator for whether respondent approached police on legal matter
    # in past 5 years. Codes other than 1 and 2, such as nonresponse (98) and
    # doesn't know (99), match no condition and so become missing.
    pol_approach = case_when(
      q2a_1 == 1 ~ 1,
      q2a_1 == 2 ~ 0
    ),
    # Create measure for the count of all crimes committed against respondent
    # or the respondent's household in last 5 years
    # Note, no nonresponse or doesn't know on this measure
    total_crimes = Count_Q4,
    # Create indicator for whether respondent filed an FIR. Nonresponse (98)
    # and doesn't know (99) match no condition and so become missing.
    file_fir = case_when(
      q8 == 1 ~ 1,
      q8 == 2 ~ 0
    ),
    # Create measure for respondent-estimated number of coethnic police in
    # local thana. Note, no nonresponse or doesn't know on this measure
    n_coethnic = q8a,
    # Create measure for the number of times the respondent had been
    # stopped by the police, setting nonresponse (98) and doesn't know (99)
    # to missing
    times_stopped = set_nonresponse_na(q1a),
    # Using new variable "times_stopped", create an indicator for whether
    # respondent has ever been stopped by the police
    # NOTE(review): a value of 1 is coded as "not stopped", so q1a == 1 is
    # presumably the "never" category -- confirm against the codebook
    stopped = case_when(
      times_stopped == 1 ~ 0,
      times_stopped > 1 ~ 1
    ),
    # Create observational measure of whether respondent thinks coethnic
    # will receive preferential treatment from the police
    # Note, no nonresponse or doesn't know on this measure
    coethnics_preferential = case_when(
      q5a_3 == 1 ~ 1,
      q5a_3 %in% c(2, 3) ~ 0
    )
  ) %>%
  # Keep the variables needed for analysis
  dplyr::select(
    resp_id, ethnicity, muhajir, pashtun, sindhi, non_sindhi,
    pol_ethnicity, coeth_police, police_listen,
    police_fair, police_preferential, police_trust, prov_trust,
    nat_trust, psu, muslim, age, under30, years_lived, education,
    male, female, pol_approach, total_crimes, file_fir, n_coethnic,
    times_stopped, stopped, coethnics_preferential
  ) %>%
  # Write clean data to csv (no row-name column)
  write.csv(
    .,
    file = file.path(root, "LyonMalik_Policing_Data.csv"),
    row.names = FALSE
  )
